{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from os import listdir\n",
    "from numpy import array\n",
    "from keras.preprocessing.text import Tokenizer, one_hot\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.models import Model\n",
    "from keras.utils import to_categorical\n",
    "from keras.layers import Embedding, TimeDistributed, RepeatVector, LSTM, concatenate , Input, Reshape, Dense, Flatten\n",
    "from keras.preprocessing.image import array_to_img, img_to_array, load_img\n",
    "from keras.applications.inception_resnet_v2 import InceptionResNetV2, preprocess_input\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the images and preprocess them for inception-resnet\n",
    "images = []\n",
    "all_filenames = listdir('resources/images/')\n",
    "all_filenames.sort()\n",
    "for filename in all_filenames:\n",
    "    images.append(img_to_array(load_img('resources/images/'+filename, target_size=(299, 299))))\n",
    "images = np.array(images, dtype=float)\n",
    "images = preprocess_input(images)\n",
    "\n",
    "# Run the images through inception-resnet and extract the features without the classification layer\n",
    "IR2 = InceptionResNetV2(weights=None, include_top=False)\n",
    "IR2.load_weights('/data/models/inception_resnet_v2_weights_tf_dim_ordering_tf_kernels_notop.h5')\n",
    "features = IR2.predict(images)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We will cap each input sequence to 100 tokens\n",
    "max_caption_len = 100\n",
    "# Initialize the function that will create our vocabulary \n",
    "tokenizer = Tokenizer(filters='', split=\" \", lower=False)\n",
    "\n",
    "# Read a document and return a string\n",
    "def load_doc(filename):\n",
    "    file = open(filename, 'r')\n",
    "    text = file.read()\n",
    "    file.close()\n",
    "    return text\n",
    "\n",
    "# Load all the HTML files\n",
    "X = []\n",
    "all_filenames = listdir('resources/html/')\n",
    "all_filenames.sort()\n",
    "for filename in all_filenames:\n",
    "    X.append(load_doc('resources/html/'+filename))\n",
    "\n",
    "# Create the vocabulary from the html files\n",
    "tokenizer.fit_on_texts(X)\n",
    "\n",
    "# Add +1 to leave space for empty words\n",
    "vocab_size = len(tokenizer.word_index) + 1\n",
    "# Translate each word in text file to the matching vocabulary index\n",
    "sequences = tokenizer.texts_to_sequences(X)\n",
    "# The longest HTML file\n",
    "max_length = max(len(s) for s in sequences)\n",
    "\n",
    "# Intialize our final input to the model\n",
    "X, y, image_data = list(), list(), list()\n",
    "for img_no, seq in enumerate(sequences):\n",
    "    for i in range(1, len(seq)):\n",
    "        # Add the entire sequence to the input and only keep the next word for the output\n",
    "        in_seq, out_seq = seq[:i], seq[i]\n",
    "        # If the sentence is shorter than max_length, fill it up with empty words\n",
    "        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]\n",
    "        # Map the output to one-hot encoding\n",
    "        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]\n",
    "        # Add and image corresponding to the HTML file\n",
    "        image_data.append(features[img_no])\n",
    "        # Cut the input sentence to 100 tokens, and add it to the input data\n",
    "        X.append(in_seq[-100:])\n",
    "        y.append(out_seq)\n",
    "\n",
    "X, y, image_data = np.array(X), np.array(y), np.array(image_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the encoder\n",
    "image_features = Input(shape=(8, 8, 1536,))\n",
    "image_flat = Flatten()(image_features)\n",
    "image_flat = Dense(128, activation='relu')(image_flat)\n",
    "ir2_out = RepeatVector(max_caption_len)(image_flat)\n",
    "\n",
    "language_input = Input(shape=(max_caption_len,))\n",
    "language_model = Embedding(vocab_size, 200, input_length=max_caption_len)(language_input)\n",
    "language_model = LSTM(256, return_sequences=True)(language_model)\n",
    "language_model = LSTM(256, return_sequences=True)(language_model)\n",
    "language_model = TimeDistributed(Dense(128, activation='relu'))(language_model)\n",
    "\n",
    "# Create the decoder\n",
    "decoder = concatenate([ir2_out, language_model])\n",
    "decoder = LSTM(512, return_sequences=False)(decoder)\n",
    "decoder_output = Dense(vocab_size, activation='softmax')(decoder)\n",
    "\n",
    "# Compile the model\n",
    "model = Model(inputs=[image_features, language_input], outputs=decoder_output)\n",
    "model.compile(loss='categorical_crossentropy', optimizer='rmsprop')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the neural network\n",
    "model.fit([image_data, X], y, batch_size=64, shuffle=False, epochs=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# map an integer to a word\n",
    "def word_for_id(integer, tokenizer):\n",
    "    for word, index in tokenizer.word_index.items():\n",
    "        if index == integer:\n",
    "            return word\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# generate a description for an image\n",
    "def generate_desc(model, tokenizer, photo, max_length):\n",
    "    # seed the generation process\n",
    "    in_text = 'START'\n",
    "    # iterate over the whole length of the sequence\n",
    "    for i in range(900):\n",
    "        # integer encode input sequence\n",
    "        sequence = tokenizer.texts_to_sequences([in_text])[0][-100:]\n",
    "        # pad input\n",
    "        sequence = pad_sequences([sequence], maxlen=max_length)\n",
    "        # predict next word\n",
    "        yhat = model.predict([photo,sequence], verbose=0)\n",
    "        # convert probability to integer\n",
    "        yhat = np.argmax(yhat)\n",
    "        # map integer to word\n",
    "        word = word_for_id(yhat, tokenizer)\n",
    "        # stop if we cannot map the word\n",
    "        if word is None:\n",
    "            break\n",
    "        # append as input for generating the next word\n",
    "        in_text += ' ' + word\n",
    "        # Print the prediction\n",
    "        print(' ' + word, end='')\n",
    "        # stop if we predict the end of the sequence\n",
    "        if word == 'END':\n",
    "            break\n",
    "    return"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load and image, preprocess it for IR2, extract features and generate the HTML\n",
    "test_image = img_to_array(load_img('resources/images/86.jpg', target_size=(299, 299)))\n",
    "test_image = np.array(test_image, dtype=float)\n",
    "test_image = preprocess_input(test_image)\n",
    "test_features = IR2.predict(np.array([test_image]))\n",
    "generate_desc(model, tokenizer, np.array(test_features), 100)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
